import os

import dashvector
from langchain.prompts import ChatPromptTemplate
from langchain_community.chat_models.tongyi import ChatTongyi
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
from langchain_community.vectorstores.dashvector import DashVector
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain


# Both a DashScope API key (LLM + embeddings) and a DashVector API key
# (vector store) are needed.  They are secrets: export them in the shell or
# inject them from a secret manager -- never commit them to source control.
_REQUIRED_ENV_VARS = ("DASHSCOPE_API_KEY", "DASHVECTOR_API_KEY")

# Page that is scraped and indexed as the knowledge source about Sora.
_SOURCE_URL = "https://pixso.cn/designskills/what-is-sora/"

# DashScope text-embedding model used to vectorise the chunks.
_EMBEDDING_MODEL = "text-embedding-v1"

# Per the original author's note, a single embedding input row may not exceed
# 2048 characters, so chunk_size must stay at or below this value.
_CHUNK_SIZE = 2048
_CHUNK_OVERLAP = 200

# Prompt handed to the LLM; {context} receives the retrieved chunks and
# {input} receives the user's question.
_PROMPT_TEMPLATE = """Answer the following question based only on the provided context:

<context>
{context}
</context>

Question: {input}"""

_QUESTION = "What is the Sora model?"


def _require_env(names):
    """Raise RuntimeError listing every variable in *names* that is unset or blank."""
    missing = [name for name in names if not os.environ.get(name, "").strip()]
    if missing:
        raise RuntimeError(
            "Missing required environment variable(s): "
            + ", ".join(missing)
            + ". Export them before running this script."
        )


def _build_retrieval_chain(url):
    """Scrape *url*, index its chunks in DashVector and return a retrieval chain."""
    # Alibaba's Tongyi Qianwen chat model.
    llm = ChatTongyi()

    # Fetch the raw page content.
    docs = WebBaseLoader(url).load()

    # DashScope embeddings; LangChain integration docs:
    # https://python.langchain.com/v0.2/docs/integrations/text_embedding/dashscope/
    embeddings = DashScopeEmbeddings(model=_EMBEDDING_MODEL)

    # Recursively split the page into overlapping chunks small enough to embed.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=_CHUNK_SIZE,
        chunk_overlap=_CHUNK_OVERLAP,
    )
    documents = text_splitter.split_documents(docs)

    # Store the chunks in DashVector; LangChain integration docs:
    # https://python.langchain.com/v0.2/docs/integrations/vectorstores/dashvector/
    vector = DashVector.from_documents(documents, embeddings)

    # Chain that stuffs the retrieved documents into the prompt's {context}.
    prompt = ChatPromptTemplate.from_template(_PROMPT_TEMPLATE)
    document_chain = create_stuff_documents_chain(llm, prompt)

    # Retrieve the chunks relevant to the question, then answer from them.
    retriever = vector.as_retriever()
    return create_retrieval_chain(retriever, document_chain)


def main():
    """Answer a question about Sora from the scraped page and print the answer.

    Raises:
        RuntimeError: if DASHSCOPE_API_KEY or DASHVECTOR_API_KEY is unset or
            blank in the environment.
    """
    # Fail fast, before any network call, with a message naming what is missing.
    _require_env(_REQUIRED_ENV_VARS)

    # DASHVECTOR_ENDPOINT is shown in the DashVector console.  Keep a value
    # the operator already exported; only fall back to the empty placeholder
    # the script has always used when nothing is configured.
    os.environ.setdefault("DASHVECTOR_ENDPOINT", "")

    retrieval_chain = _build_retrieval_chain(_SOURCE_URL)
    response = retrieval_chain.invoke({"input": _QUESTION})
    print(response["answer"])


if __name__ == "__main__":
    main()

